/* vim:set ts=2 sw=2 sts=2 et: */
/**
 * \author     Marcus Holland-Moritz (github@mhxnet.de)
 * \copyright  Copyright (c) Marcus Holland-Moritz
 *
 * This file is part of dwarfs.
 *
 * dwarfs is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * dwarfs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with dwarfs.  If not, see <https://www.gnu.org/licenses/>.
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 */

#include <algorithm>
#include <array>
#include <numeric>
#include <random>
#include <string_view>

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <dwarfs/internal/fsst.h>

using namespace std::string_view_literals;
using namespace dwarfs::internal;

namespace {

constexpr std::array<std::string_view, 1000> test_strings{
    "aburabozu",
    "abuzz",
    "acacatechol",
    "acclamator",
    "accumulatively",
    "acephalan",
    "acetaldehydrase",
    "acetone",
    "ackman",
    "acquaintant",
    "acquisited",
    "acraniate",
    "acushla",
    "adamantoma",
    "addability",
    "adipomatous",
    "adjectively",
    "administrative",
    "Adramelech",
    "aku",
    "alaihi",
    "alburnum",
    "alcogene",
    "alcoholdom",
    "alcoholometric",
    "alliciency",
    "alloerotism",
    "allowedly",
    "alluring",
    "alpenhorn",
    "alphabetics",
    "alternariose",
    "Amasta",
    "amberoid",
    "ambidexterity",
    "ambient",
    "Ambystoma",
    "anaerobia",
    "analogically",
    "anamnionic",
    "Anaryan",
    "Anastasia",
    "aniseikonia",
    "Anophelinae",
    "antebridal",
    "antiministerialist",
    "antivenom",
    "antivermicular",
    "antoeci",
    "aphorize",
    "aphrizite",
    "apsidal",
    "aquopentamminecobaltic",
    "ardeb",
    "areometric",
    "argumentatory",
    "armpit",
    "arteriostenosis",
    "Arthurian",
    "Aruac",
    "asbestoidal",
    "aspirata",
    "assise",
    "astigmatical",
    "asynaptic",
    "asystolic",
    "atomician",
    "attachment",
    "attingence",
    "aurothiosulphuric",
    "autoanalytic",
    "autoinduction",
    "automata",
    "autosymbolic",
    "avenalin",
    "axmanship",
    "azine",
    "babloh",
    "babuina",
    "babyishly",
    "Bacchides",
    "bacteriform",
    "Baniva",
    "baronetical",
    "bathroomed",
    "bauta",
    "beaminess",
    "beamwork",
    "becircled",
    "bedrop",
    "bepearl",
    "bereaven",
    "besieging",
    "betinge",
    "Bharata",
    "bibliothecal",
    "bicuspid",
    "binarium",
    "birch",
    "Birkeniidae",
    "bishopling",
    "blacklegs",
    "blandness",
    "Blankit",
    "blenching",
    "blockheadedly",
    "Blumea",
    "blunderful",
    "bonnyvis",
    "boobery",
    "botanize",
    "botryoid",
    "bountifulness",
    "brachystaphylic",
    "Brachystomata",
    "branchful",
    "Branchiopoda",
    "braunite",
    "breeder",
    "brideweed",
    "broadpiece",
    "bronchomucormycosis",
    "Buddleia",
    "buffer",
    "bullionless",
    "bump",
    "burnt",
    "burtonize",
    "butlerage",
    "cacodemon",
    "calangay",
    "calfling",
    "canniness",
    "Cantabrize",
    "capocchia",
    "carapacic",
    "carnotite",
    "carpentering",
    "caryophyllous",
    "cashcuttee",
    "castlet",
    "categorist",
    "causticity",
    "cavate",
    "cavernous",
    "cecidologist",
    "centiliter",
    "cephalopathy",
    "Cercolabidae",
    "cerebrosensorial",
    "Cesare",
    "Chaouia",
    "chapatty",
    "chargeman",
    "chati",
    "chatteration",
    "cheecha",
    "chest",
    "cheve",
    "chiastic",
    "chiastoneury",
    "chlorite",
    "chronal",
    "churnmilk",
    "circuity",
    "circumoral",
    "clackety",
    "clearstarch",
    "Cleistothecopsis",
    "clifflike",
    "clitelline",
    "clithe",
    "coaration",
    "codomestication",
    "coferment",
    "cog",
    "Colchis",
    "collocatory",
    "colombier",
    "colophonium",
    "colpindach",
    "concern",
    "concessible",
    "conjugal",
    "conjury",
    "consenting",
    "constable",
    "continentality",
    "contraponend",
    "Contraposaune",
    "conventionally",
    "coprecipitation",
    "coprophilous",
    "corespect",
    "cossette",
    "cotranslator",
    "cottonbush",
    "councilorship",
    "coverchief",
    "crampy",
    "craniovertebral",
    "Craterid",
    "creamy",
    "credulity",
    "criticship",
    "cubbishly",
    "cunila",
    "Cyanastraceae",
    "cyanaurate",
    "cyanochroia",
    "cyanole",
    "cylindrograph",
    "Cypselid",
    "dakir",
    "dasturi",
    "dealkylate",
    "deaminize",
    "decretum",
    "dehors",
    "demipike",
    "Dendroidea",
    "diaphony",
    "dicing",
    "diglyceride",
    "Dioon",
    "diphenylchloroarsine",
    "Disamis",
    "disassociation",
    "discircumspection",
    "discursiveness",
    "disdiaclast",
    "disembower",
    "disfashion",
    "dishonorably",
    "disroot",
    "distastefulness",
    "distinctly",
    "distortionist",
    "disyllabic",
    "divertibility",
    "doored",
    "dorsoventral",
    "doughlike",
    "downstairs",
    "dragade",
    "drugless",
    "Dryope",
    "duke",
    "dusting",
    "earthed",
    "eatable",
    "ebracteate",
    "ectopic",
    "eelworm",
    "elect",
    "electrion",
    "electrocontractility",
    "electromerism",
    "electropotential",
    "elusive",
    "embolium",
    "emissile",
    "Emmental",
    "Empidonax",
    "emptor",
    "enclitical",
    "endotheca",
    "endurer",
    "enjoyably",
    "epistemological",
    "epizoan",
    "equalist",
    "equally",
    "equerry",
    "equiangular",
    "equinoctially",
    "equiparant",
    "Eriophorum",
    "erotic",
    "Esopus",
    "espadon",
    "ethmovomerine",
    "euphemist",
    "Europasian",
    "evagation",
    "excusator",
    "exemplarily",
    "exhalatory",
    "exiguity",
    "expectance",
    "expeditiousness",
    "extemporally",
    "fabiform",
    "facultate",
    "fallacy",
    "fantoccini",
    "fanwork",
    "fastland",
    "Faustian",
    "fawnskin",
    "fetch",
    "ficklety",
    "figurize",
    "Filipiniana",
    "fingery",
    "finiteness",
    "flayer",
    "flindosy",
    "flinger",
    "flinthearted",
    "flogging",
    "folliful",
    "foodstuff",
    "foredeck",
    "forewoman",
    "fortieth",
    "fortin",
    "fraternally",
    "freeholdership",
    "freewill",
    "Fremontia",
    "friarly",
    "Friulian",
    "fuguist",
    "fulgentness",
    "furfuraceously",
    "furiosa",
    "galant",
    "galany",
    "gastrocnemius",
    "gaywings",
    "gazelle",
    "geminiform",
    "generic",
    "geologize",
    "geophilous",
    "germal",
    "gerontocrat",
    "gien",
    "glaucin",
    "gleefully",
    "Gliridae",
    "glottogony",
    "Glyconian",
    "gnatty",
    "gobiesocid",
    "gonoplasm",
    "granula",
    "gudewife",
    "Guisard",
    "gumwood",
    "gurgle",
    "Gyges",
    "haggardly",
    "hammerwort",
    "hammochrysos",
    "hangingly",
    "haptene",
    "hardpan",
    "harr",
    "hashish",
    "hauchecornite",
    "helleborein",
    "hemiplegy",
    "hent",
    "herborization",
    "heroify",
    "Hesperian",
    "heteroerotism",
    "histology",
    "hoboism",
    "honoree",
    "hookheal",
    "horned",
    "houseminder",
    "huantajayite",
    "hubmaking",
    "hunkerous",
    "Huterian",
    "hyaenodont",
    "hybridization",
    "hydroboracite",
    "hymenopterologist",
    "hypostilbite",
    "ichthyopolist",
    "idiomorphism",
    "idoneous",
    "immanity",
    "immeritorious",
    "impartiality",
    "impersonate",
    "improvisatorially",
    "impuberal",
    "inclinableness",
    "inconsequential",
    "incopresentable",
    "incrustant",
    "incurvation",
    "indecorous",
    "indictee",
    "informant",
    "infracentral",
    "ingeldable",
    "inherently",
    "initially",
    "initiation",
    "inscribableness",
    "insocially",
    "intercreate",
    "interisland",
    "interzooecial",
    "introsentient",
    "inversed",
    "investment",
    "invigor",
    "ironheartedly",
    "isomerical",
    "isospondylous",
    "itatartrate",
    "jadery",
    "janitor",
    "Jebusi",
    "jimpness",
    "jinny",
    "Jo",
    "jugulum",
    "kale",
    "kalymmocyte",
    "kelyphite",
    "kerbstone",
    "kettle",
    "khedive",
    "Koelreuteria",
    "Koreshan",
    "kuttar",
    "lairdess",
    "Lappish",
    "latch",
    "Latinize",
    "laudatorily",
    "laumontite",
    "lavaret",
    "leaky",
    "legislative",
    "legislatorial",
    "leoncito",
    "leopard",
    "lipoid",
    "liroconite",
    "livingness",
    "loasaceous",
    "loathness",
    "logarithmetically",
    "logorrhea",
    "loquacious",
    "lotto",
    "lowerable",
    "lycoperdaceous",
    "maintainer",
    "Malaclemys",
    "mammalogist",
    "maney",
    "Margery",
    "marron",
    "mastoidohumeral",
    "mauger",
    "mazzard",
    "meered",
    "melicerous",
    "meningomyelitis",
    "merocrystalline",
    "mesogyrate",
    "mesolabe",
    "mesothermal",
    "metacresol",
    "meteorical",
    "metronomic",
    "Michigander",
    "microchemical",
    "micropolariscope",
    "microtomic",
    "mildewer",
    "misdo",
    "misemphasis",
    "misgovernance",
    "misrender",
    "monoid",
    "mooncreeper",
    "moratory",
    "morbidity",
    "mottramite",
    "moundlet",
    "muleman",
    "multiplex",
    "multitudinal",
    "musquaw",
    "myope",
    "Myrcia",
    "mythogonic",
    "Nabalitic",
    "nailproof",
    "naipkin",
    "nasociliary",
    "Nearctica",
    "neophilological",
    "neuromyelitis",
    "nickelic",
    "nidology",
    "niello",
    "niggardize",
    "nonacquittal",
    "nonadult",
    "noncoloring",
    "nonconducive",
    "noncreeping",
    "noncurling",
    "nondegeneration",
    "nongraduated",
    "nonheritor",
    "nonoccupation",
    "nonplanar",
    "nonprevalence",
    "nonretiring",
    "nonrhyming",
    "nonsecretory",
    "nonspecial",
    "nonsubstantiation",
    "norbergite",
    "Notus",
    "nucleon",
    "number",
    "nuncupatively",
    "nymphid",
    "Observantist",
    "odontonosology",
    "offendant",
    "Oklahoma",
    "oligosite",
    "omniparity",
    "oncosis",
    "ophthalmiatrics",
    "ophthalmitis",
    "opposure",
    "orendite",
    "Orientalia",
    "ornithosaurian",
    "orthosemidine",
    "orthotactic",
    "Oryza",
    "oscheocele",
    "osse",
    "ostempyesis",
    "ostreoid",
    "Otariinae",
    "outcropper",
    "outsmart",
    "outsuck",
    "outwander",
    "overcolor",
    "overdeeming",
    "overdrowsed",
    "overjawed",
    "overpitched",
    "overpole",
    "overremissness",
    "overspring",
    "oversqueak",
    "oversystematic",
    "overtrump",
    "oxberry",
    "oxyketone",
    "palpiform",
    "Panak",
    "pancreatotomy",
    "Panorpidae",
    "Pantagruel",
    "Pantagruelically",
    "pantamorphic",
    "pantochromism",
    "pantophile",
    "papaverous",
    "Paradoxides",
    "paranymphal",
    "parasitotropic",
    "parfilage",
    "Parnassus",
    "partisan",
    "partitive",
    "pathoanatomical",
    "pauseful",
    "pedagogism",
    "Pedetidae",
    "pejorate",
    "pelican",
    "pelmatogram",
    "peltiferous",
    "pendragon",
    "pensive",
    "pentaspherical",
    "Percheron",
    "periphyllum",
    "peritomize",
    "peritonsillitis",
    "pervasively",
    "Petiolata",
    "phalarope",
    "pharmacognosia",
    "Phenalgin",
    "philomystic",
    "Pholadacea",
    "phonophotography",
    "photographize",
    "photolysis",
    "photometrograph",
    "phraseologically",
    "phrenic",
    "Phyteus",
    "phytomorphic",
    "pietistic",
    "pikle",
    "pinacone",
    "pinsons",
    "plasterer",
    "play",
    "plenicorn",
    "pleomastia",
    "plessimeter",
    "pleuroperitonaeal",
    "plexiform",
    "plumade",
    "pluviometrical",
    "pneumony",
    "podder",
    "podophthalmitic",
    "pokable",
    "Polistes",
    "porcellanid",
    "postspinous",
    "potto",
    "powwower",
    "praesystolic",
    "pram",
    "preaccomplishment",
    "preanterior",
    "preboding",
    "precordiality",
    "predeficient",
    "pregranite",
    "prehistorics",
    "preliability",
    "premeditative",
    "prepatriotic",
    "presbytic",
    "prespecialist",
    "proceed",
    "Proctotrypidae",
    "proextension",
    "profitlessness",
    "projecture",
    "promptbook",
    "proreduction",
    "prosodiac",
    "protomorph",
    "protosiphonaceous",
    "provoker",
    "proxenos",
    "proximally",
    "Prunella",
    "prunelle",
    "pseudocartilaginous",
    "Pseudopeziza",
    "pseudosocialistic",
    "pseudosyllogism",
    "psychoautomatic",
    "Pteranodon",
    "Ptolemaic",
    "pulverization",
    "pyrochlore",
    "quibble",
    "quinize",
    "quintette",
    "quintile",
    "Rajah",
    "Rastaban",
    "rebato",
    "Rebecca",
    "rebolt",
    "reburn",
    "recarburization",
    "receptionism",
    "recession",
    "recipient",
    "redjacket",
    "reflorescent",
    "refusion",
    "regimentalled",
    "Reichslander",
    "remilitarize",
    "remindal",
    "renomination",
    "repersuade",
    "repertorium",
    "replenisher",
    "representable",
    "reprise",
    "reserved",
    "resmell",
    "reticulovenose",
    "retrace",
    "retraxit",
    "retrenchable",
    "reventilate",
    "rhabdomal",
    "Rhaetian",
    "rhubarb",
    "Rhynchospora",
    "Ribhus",
    "ricksha",
    "rimose",
    "Russolatry",
    "saccharomyces",
    "saddlery",
    "sagacious",
    "samkara",
    "sauntering",
    "Sciarinae",
    "scoon",
    "scranning",
    "scribblatory",
    "scride",
    "Scriptureless",
    "scullionish",
    "seamanship",
    "seashore",
    "sedentarily",
    "selvaged",
    "sematic",
    "semiantique",
    "semicollar",
    "semigenuflection",
    "semiorb",
    "semiordinate",
    "semioxidated",
    "semiproof",
    "semiquadrantly",
    "semisociative",
    "semitheological",
    "semuncia",
    "sensal",
    "septarian",
    "seriation",
    "serpentina",
    "serranoid",
    "shaftman",
    "Shakespeareana",
    "shandrydan",
    "sheepbiter",
    "Shetlandic",
    "shoddywards",
    "showless",
    "sifting",
    "signifier",
    "sinoauricular",
    "siphonapterous",
    "siphonosome",
    "sittee",
    "smellage",
    "Smyrniot",
    "sniffing",
    "snubbishness",
    "soapberry",
    "sociologizer",
    "softball",
    "solemnize",
    "solitudinize",
    "somatical",
    "somnolently",
    "sooky",
    "soonish",
    "sparsely",
    "spathed",
    "speechmaking",
    "spellword",
    "Sphaerocarpus",
    "sphindid",
    "splanchnodynia",
    "splenocyte",
    "spondylexarthrosis",
    "spongiolin",
    "sporeling",
    "spotted",
    "squireless",
    "stachys",
    "Stalinism",
    "stampweed",
    "stannate",
    "stanner",
    "statesmanship",
    "stauracin",
    "stenosed",
    "stereoscopically",
    "stickwater",
    "Stilophora",
    "stimulability",
    "stonify",
    "storkish",
    "stoutly",
    "stove",
    "strenuousness",
    "strongbox",
    "sturdiness",
    "sufflation",
    "sulfamethazine",
    "sunshining",
    "supercarbonate",
    "superfluousness",
    "superfortunate",
    "superreliance",
    "supramaxilla",
    "surinamine",
    "surprisable",
    "surrebut",
    "swapping",
    "Swazi",
    "swingable",
    "Synchytrium",
    "syndesmology",
    "syntaxist",
    "tabor",
    "tairn",
    "tangle",
    "Tantony",
    "tartaret",
    "teammate",
    "tearable",
    "telecommunication",
    "telford",
    "tempre",
    "tender",
    "testicle",
    "thegnly",
    "theoretician",
    "theosophism",
    "Thiobacillus",
    "throatroot",
    "Thunnidae",
    "tidewater",
    "Timonian",
    "Timuquan",
    "tolerable",
    "tonicobalsamic",
    "tonsillectomize",
    "toolstock",
    "tournant",
    "trabacolo",
    "tragicomicality",
    "tramway",
    "translative",
    "transmigrationist",
    "trianthous",
    "trichitis",
    "tricoryphean",
    "trimesitic",
    "trionychoid",
    "tristichous",
    "trona",
    "Tsuga",
    "turbaned",
    "turkeyberry",
    "twangy",
    "ultraconfident",
    "ultraconservative",
    "Ulvales",
    "unalimentary",
    "unamply",
    "unauthentic",
    "unbold",
    "unceremented",
    "uncially",
    "uncompact",
    "unconcernment",
    "unconsoling",
    "uncultured",
    "undecreed",
    "undefinedly",
    "undeformedness",
    "undenominated",
    "undercharged",
    "underpassion",
    "undevelopable",
    "unduncelike",
    "unduty",
    "unexcitable",
    "unfanned",
    "unfence",
    "unfighting",
    "unglorious",
    "ungrow",
    "unhaste",
    "unifocal",
    "unilabiated",
    "unimperialistic",
    "unimposedly",
    "unincarnate",
    "unliquid",
    "unmechanize",
    "unmellowed",
    "unmistakable",
    "unmuddle",
    "unnagging",
    "unnegotiableness",
    "unobstruct",
    "unobtrusiveness",
    "unorganically",
    "unperishably",
    "unplacid",
    "unpolled",
    "unpossessed",
    "unprivileged",
    "unpronounced",
    "unproportioned",
    "unpurged",
    "unreclined",
    "unregretted",
    "unremittingness",
    "unrepresentedness",
    "unruled",
    "unsalutary",
    "unsalvability",
    "unsanctify",
    "unsaponified",
    "unseated",
    "unseldom",
    "unshavenly",
    "unsolvable",
    "unstopper",
    "unsung",
    "unsupplicated",
    "untaintedness",
    "untenty",
    "unthaw",
    "untrainedly",
    "untranspassable",
    "unvetoed",
    "unvocalized",
    "unwalled",
    "unwarlike",
    "unwhisked",
    "unwrathful",
    "uphoard",
    "upprick",
    "uptrain",
    "upwork",
    "upwreathe",
    "urbanely",
    "ureometry",
    "ureterocystoscope",
    "urethroscopy",
    "urological",
    "urticarial",
    "usara",
    "vacantry",
    "vaccinogenous",
    "valeramide",
    "valonia",
    "vanillinic",
    "velate",
    "viewless",
    "visceroparietal",
    "vituperative",
    "vocably",
    "volatilizable",
    "voucher",
    "Wagneriana",
    "waketime",
    "walleye",
    "wappenschaw",
    "waxweed",
    "wear",
    "weatherer",
    "weave",
    "werewolfism",
    "wheem",
    "whippletree",
    "whistlewing",
    "whom",
    "wicking",
    "widowership",
    "windwayward",
    "wisehearted",
    "workbench",
    "worldish",
    "worsening",
    "xenian",
    "yachting",
    "Yugoslavic",
    "zebu",
    "zimme",
    "zoocurrent",
    "zoopraxiscope",
};

constexpr auto total_string_length = std::accumulate(
    test_strings.begin(), test_strings.end(), 0,
    [](size_t sum, std::string_view str) { return sum + str.size(); });

} // namespace

TEST(fsst_test, basic) {
  auto const res = fsst_encoder::compress(test_strings);

  ASSERT_TRUE(res.has_value());
  EXPECT_EQ(test_strings.size(), res->compressed_data.size());
  EXPECT_GT(res->dictionary.size(), 550);
  EXPECT_LT(res->dictionary.size(), 600);
  EXPECT_LT(res->buffer.size(), 9 * total_string_length / 17);

  auto const decoder = fsst_decoder{res->dictionary};

  for (size_t i = 0; i < test_strings.size(); ++i) {
    auto const& str = test_strings[i];
    auto const& compressed_data = res->compressed_data[i];

    auto const decompressed = decoder.decompress(compressed_data);

    EXPECT_EQ(str, decompressed);
  }
}

TEST(fsst_random_test, random_strings) {
#ifdef DWARFS_TEST_CROSS_COMPILE
  static constexpr int num_random_tests = 100;
#else
  static constexpr int num_random_tests = 1000;
#endif

  std::mt19937 rng{42};
  std::uniform_int_distribution<size_t> sample_size_dist(0, 100);
  std::vector<size_t> sample_sizes;

  sample_sizes.reserve(num_random_tests);
  sample_sizes.push_back(0); // Definitely include the empty set
  std::ranges::generate_n(std::back_inserter(sample_sizes),
                          num_random_tests - 1,
                          [&]() { return sample_size_dist(rng); });

  for (auto const sample_size : sample_sizes) {
    std::vector<std::string_view> input(sample_size);
    std::ranges::sample(test_strings, input.begin(), input.size(), rng);

    auto const res = fsst_encoder::compress(input, true);
    auto const res2 = fsst_encoder::compress(input);

    if (sample_size == 0) {
      ASSERT_FALSE(res.has_value());
      ASSERT_FALSE(res2.has_value());
    } else {
      ASSERT_TRUE(res.has_value());
      EXPECT_EQ(input.size(), res->compressed_data.size());

      auto const total_input_length =
          std::accumulate(input.begin(), input.end(), size_t{0},
                          [](size_t n, auto const& s) { return n + s.size(); });

      if (res->dictionary.size() + res->buffer.size() < total_input_length) {
        EXPECT_TRUE(res2.has_value());
      } else {
        EXPECT_FALSE(res2.has_value());
      }

      if (sample_size >= 500) {
        EXPECT_LE(res->buffer.size(), 60 * total_input_length / 100);
      } else if (sample_size >= 200) {
        EXPECT_LE(res->buffer.size(), 70 * total_input_length / 100);
      } else if (sample_size >= 100) {
        EXPECT_LE(res->buffer.size(), 100 * total_input_length / 100);
      } else if (sample_size >= 20) {
        EXPECT_LE(res->buffer.size(), 120 * total_input_length / 100);
      } else {
        EXPECT_LE(res->buffer.size(), 200 * total_input_length / 100);
      }

      auto const decoder = fsst_decoder{res->dictionary};

      for (size_t i = 0; i < input.size(); ++i) {
        auto const& str = input[i];
        auto const& compressed_data = res->compressed_data[i];

        auto const decompressed = decoder.decompress(compressed_data);

        EXPECT_EQ(str, decompressed);
      }
    }
  }
}
